# Pckgs -------------------------------------
# Bootstrap pacman (install only if absent), then load every project
# dependency in one p_load() call. requireNamespace() avoids attaching
# pacman just to test for it; the explicit pacman:: prefix makes the
# p_load call work even if pacman is not attached.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
pacman::p_load(
  tidyverse,
  janitor, skimr,
  here, paint,
  readxl,
  repurrrsive, # examples of recursive lists
  listviewer,  # provides an interactive method for viewing the structure of a list
  httr, jsonlite, XML, xml2,
  oai,  # R client to work with OAI-PMH
  citr,
  fs
)

# WB Project Data Preprocessing
Work in progress
—————————————————————————-
Data sources
WB Projects & Operations
World Bank Projects & Operations: Data Catalog https://projects.worldbank.org/en/projects-operations/project-search
Accessibility Classification: public under Creative Commons Attribution 4.0
esempio https://datacatalog.worldbank.org/search/dataset/0037800 https://datacatalog.worldbank.org/search/dataset/0037800/World-Bank-Projects---Operations
Raw data
I retrieved manually ALL WB projects approved between FY 1973 and 2023 (last FY incomplete) on 09/22/2022 (WDRs go from 1978-2022) using this example url FY 1978/79 and saved individual .xlsx files in data/raw_data/projects
—————————————————————————
{Ingest Projects data (via API)}
DOESN’T WORK!
1/2 Ingest Projects data (manually split)
I retrieved manually ALL WB projects approved between FY 1973 and 2023 (last FY incomplete) on 09/22/2022 (WDRs go from 1978-2022) using this example url and saved individual .xlsx files in data/raw_data/project
- note: the manual download is limited to 500 records per file
— Load all .xlsx files separately
— Save objs in folder as .Rds files separately
2/2 Ingest Projects data (manually all together)
I retrieved manually ALL WB projects approved between FY 1973 and 2023 (last FY incomplete) on 31/08/2024 using simply the Excel button on this page this WBG Projects and saved HUUUGE .xlsx files in data/raw_data/project2/all_projects_as_of29ago2024.xls
# Ingest the single "all projects" Excel export. The file has a title row
# (skipped), then a 2-row header block, then the data. Hoist the path so it
# is written once instead of three times.
path_all_proj <- "data/raw_data/project2/all_projects_as_of29ago2024.xls"

# Raw dump without column names (title row skipped) — kept for inspection.
all_projects_as_of29ago2024 <- read_excel(path_all_proj,
                                          col_names = FALSE,
                                          skip = 1)

# Column names: the two header rows that follow the skipped title row.
cnames <- read_excel(path_all_proj,
                     col_names = FALSE,
                     skip = 1,
                     n_max = 2)

# Complete file with proper column names (data start after row 2).
all_proj <- read_excel(path_all_proj,
                       col_names = TRUE,
                       skip = 2)

# Persist for later sessions.
save(all_proj, file = "data/raw_data/project2/all_projects_as_of29ago2024.Rdata")

# Explore Project mega file
Clean all_proj
# Mess of data format weird in different ways in 2 cols:
# 1947-12-31 12:00:00 # closingdate
# 8/3/1948 12:00:00 AM # closingdate
#
# 1955-03-15T00:00:00Z # boardapprovaldate
# Mutate the date columns to parse the dates, handling different formats and blanks
# Parse the two inconsistently-formatted date columns, then derive calendar
# year/month and World Bank fiscal year. WB FY runs July-June: months Jan-Jun
# belong to the FY named after that calendar year; Jul-Dec roll into year + 1.
all_proj_t <- all_proj %>%
  # 1) closingdate arrives as "MM/DD/YYYY HH:MM:SS" with or without AM/PM;
  #    parse_date_time() with both orders covers the 24h and 12h variants.
  mutate(across("closingdate", ~ if_else(
    . == "",
    NA_POSIXct_, # Return NA for blank entries (NA_POSIXct_ comes from lubridate)
    parse_date_time(., orders = c("mdy HMS", "mdy HMSp"))
  )),
  # 2) boardapprovaldate follows ISO 8601 (e.g., "1952-04-29T00:00:00Z"),
  #    so ymd_hms() parses it directly.
  across("boardapprovaldate", ~ if_else(
    . == "",
    NA_POSIXct_, # Return NA for blank entries
    ymd_hms(., tz = "UTC") # Handle ISO 8601 format (e.g., "1952-04-29T00:00:00Z")
  ))) %>%
  # Derive approval FY from calendar year + month.
  mutate(boardapproval_year = year(boardapprovaldate),
         boardapproval_month = month(boardapprovaldate)) %>%
  mutate(boardapprovalFY = case_when(
    boardapproval_month >= 1 & boardapproval_month < 7 ~ boardapproval_year,
    boardapproval_month >= 7 & boardapproval_month <= 12 ~ boardapproval_year +1)) %>%
  relocate(boardapprovalFY, .after = boardapprovaldate ) %>%
  # Same FY derivation for the closing date.
  mutate(closingdate_year = year(closingdate),
         closingdate_month = month(closingdate)) %>%
  mutate(closingdateFY = case_when(
    closingdate_month >= 1 & closingdate_month < 7 ~ closingdate_year,
    closingdate_month >= 7 & closingdate_month <= 12 ~ closingdate_year +1)) %>%
  relocate(closingdateFY, .after = closingdate )
# Sanity-check the parsed dates and derived fiscal years with frequency tables.
tabyl(all_proj$closingdate)
tabyl(all_proj_t$closingdateFY)
tabyl(all_proj$boardapprovaldate)
tabyl(all_proj_t$boardapprovalFY)

# Explore who are the ones with no PDO
# Count missing values (NA) in a chosen subset of columns.
#
# Uses skimr's summary to pull the per-column NA count.
#
# @param data    A data frame.
# @param columns Character vector of column names to inspect.
# @return A tibble with one row per column: skim_variable, n_missing.
count_missing_values <- function(data, columns) {
  data %>%
    select(all_of(columns)) %>%
    skim() %>%
    select(skim_variable, n_missing)
}
# Use the function on a subset of columns
count_missing_values(all_proj_t, c("pdo", "projectstatusdisplay", "boardapprovalFY", "sector1", "theme1"))

# Projects with no Project Development Objective (PDO) text.
missing_pdo <- all_proj_t %>%
  #select(id, pdo, countryname, projectstatusdisplay, lendinginstr, boardapprovalFY, projectfinancialtype) %>%
  filter(is.na(pdo))

# Compare category distributions: all projects vs. projects missing a PDO.
tabyl(all_proj_t$projectstatusdisplay) %>% adorn_pct_formatting()
tabyl(missing_pdo$projectstatusdisplay) %>% adorn_pct_formatting()
tabyl(all_proj_t$regionname) %>% adorn_pct_formatting()
tabyl(missing_pdo$regionname) %>% adorn_pct_formatting()
tabyl(all_proj_t$boardapprovalFY) %>% adorn_pct_formatting()
tabyl(missing_pdo$boardapprovalFY) %>% adorn_pct_formatting()
tabyl(all_proj_t$projectfinancialtype) %>% adorn_pct_formatting()
tabyl(missing_pdo$projectfinancialtype) %>% adorn_pct_formatting()
tabyl(all_proj_t$sector1) %>% adorn_pct_formatting()
tabyl(missing_pdo$sector1) %>% adorn_pct_formatting()
tabyl(all_proj_t$theme1) %>% adorn_pct_formatting()
tabyl(missing_pdo$theme1) %>% adorn_pct_formatting() # most NA
# Environmental Assessment Category
tabyl(all_proj_t$envassesmentcategorycode) %>% adorn_pct_formatting() # most NA
tabyl(missing_pdo$envassesmentcategorycode) %>% adorn_pct_formatting()
# Environmental and Social Risk
tabyl(all_proj_t$esrc_ovrl_risk_rate) %>% adorn_pct_formatting() # most NA
tabyl(missing_pdo$esrc_ovrl_risk_rate) %>% adorn_pct_formatting()
tabyl(all_proj_t$lendinginstr) %>% adorn_pct_formatting() # Specific Investment Loan 4928 43.9%
tabyl(missing_pdo$lendinginstr) %>% adorn_pct_formatting() # Specific Investment Loan 4928 43.9%

# Based on some "critical" category, I would say that even if many projects are
# missing PDO the incidence seems to happen at random, except maybe for
# lendinginstr: Specific Investment Loans are missing PDO in 4928 projects (43.9%). Why?
>>>>>> QUI <<<<<<<<<<<<<<<<<<
REDUCED DATASET of PROJECTS
For my purposes it is safe to drop all the projects with missing PDO !
- it turns out there are no Development objectives spelled out until FY2001
—————————————————————————
World Development Reports (WRDs)
- DATA https://datacatalog.worldbank.org/search/dataset/0037800
- INSTRUCTIONS https://documents.worldbank.org/en/publication/documents-reports/api
- Following: (Kaye 2019; Robinson 2017; Robinson and Silge 2022)
Raw data
OAI-PMH
OAI-PMH (Open Archives Initiative Protocol for Metadata Harvest-ing) services, a protocol developed by the Open Archives Initiative (https://en.wikipedia.org/wiki/Open_Archives_Initiative). OAI-PMH uses XML data format transported over HTTP.
- packg: Package
oaiis built on xml2 and httr. - paging: OAI-PMH uses (optionally)
resumptionTokens, with an optional expiration date. These tokens can be used to continue on to the next chunk of data, if the first request did not get to the end.
1. World Bank list of World Development Reports
General OKR link https://openknowledge.worldbank.org/search?spc.page=1&query=%20&scope=3d9bbbf6-c007-5043-b655-04d8a1cfbfb2
https://openknowledge.worldbank.org/entities/publication/5e5ac9f1-71ee-4734-825e-60966658395f 2023 | key takeaways | https://openknowledge.worldbank.org/server/api/core/bitstreams/54de9b54-dc23-43da-9a88-fe94dd5a3c24/content
https://openknowledge.worldbank.org/server/api/core/bitstreams/e1e22749-80c3-50ea-b7e1-8bc332d0c2ff/content
-
World Development Report (WDR);
- (in 2022) https://openknowledge.worldbank.org/handle/10986/2124
- (in 2024) https://openknowledge.worldbank.org/collections/3d9bbbf6-c007-5043-b655-04d8a1cfbfb2?spc.sf=dc.date.issued&spc.sd=DESC
- (in 2024) https://openknowledge.worldbank.org/collections/3d9bbbf6-c007-5043-b655-04d8a1cfbfb2?spc.sf=dc.date.issued&spc.sd=DESC&f.supportedlanguage=en,equals&spc.page=1&spc.rpp=100
—————————————————————————
# Scrape the WDR 2023 full-record page with rvest.
install.packages("rvest") # NOTE(review): one-off install — consider adding rvest to p_load above
library(rvest)

link2023 <- "https://openknowledge.worldbank.org/entities/publication/5e5ac9f1-71ee-4734-825e-60966658395f/full"
WDR2023 <- read_html(link2023)

# TODO: the xpath is still an empty placeholder — supply a node selector.
WDR2023 %>%
  html_elements(xpath = "")

# 1. World Bank list of World Development Reports
Here I want to extract documents metadata from WBG OKR repository.
Which metadata: + collections: “Books” https://openknowledge.worldbank.org/handle/10986/4 + sub-collections: “Corporate Flagships” https://openknowledge.worldbank.org/handle/10986/2123 + World Development Report (WDR); https://openknowledge.worldbank.org/handle/10986/2124 https://openknowledge.worldbank.org/collections/3d9bbbf6-c007-5043-b655-04d8a1cfbfb2?spc.sf=dc.date.issued&spc.sd=DESC
+ Global Economic Prospects (GEP),
+ Doing Business (DB), and
+ Poverty and Shared Prosperity (PSP).
+ ...
I can search adding a keyword:
+ World Development Report (WDR); + Keyword = “economic development”
… the url would become: https://openknowledge.worldbank.org/handle/10986/2124/discover?filtertype=subject&filter_relational_operator=equals&filter=economic+development
>>>>>> QUI <<<<<<<<<<<<<<<<<<
API response
# DATA https://datacatalog.worldbank.org/search/dataset/0037800
# INSTRUCTIONS https://documents.worldbank.org/en/publication/documents-reports/api
# VARIABLES ----------------------------------------------
# Base URL for an OAI-PMH ListRecords query on an OKR collection.
base <- "https://openknowledge.worldbank.org/oai/request?verb=ListRecords&metadataPrefix=oai_dc&set=col_10986_"
# Base URL with resumption token (for paging past the first chunk of records).
base0 <- "https://openknowledge.worldbank.org/oai/request?verb=ListRecords&resumptionToken=oai_dc///col_10986_"
n <- 45 # NOTE(review): looks like the expected WDR record count — appears unused below, confirm
subid <- 2124 # WDR sub-collection id within handle 10986
temp_url <- paste0(base, subid)
# Alternative retrieval via httr (kept for reference):
# response <- httr::GET(temp_url) # xml_document
# response <- httr::content(response, as="parsed")
# read_xml() fetches the URL and parses the XML in one step.
response <- xml2::read_xml(x = temp_url, encoding = "",
options = "NOBLANKS") # remove blank nodes
class(response)
# Parse a url into its component pieces.
xml2::xml_name(response) # [1] "OAI-PMH"
xml2::xml_name(xml_children(response)) # [1] "responseDate" "request" "ListRecords"
# print the full path directory:
response %>% xml_find_all( '//*') %>% xml_path()
# check length of id
#======# DON'T KNOW HOW SHE KNEW THE NAME #======#
# "/*/*[3]/*[1]/*[2]/oai_dc:dc/dc:identifier[1]"
# NOTE(review): "d1" is the prefix xml2 auto-assigns to the default namespace;
# that is presumably how the name was found — verify with xml_ns(response).
r_id <- xml2::xml_find_all(response, ".//d1:identifier") %>% xml_text()
l <- length(r_id)
print(paste("length of total list:", l))
# #======# check length of subject ??? #======#
# "/*/*[3]/*[1]/*[2]/oai_dc:dc/dc:subject[216]"
r_subject <- xml2::xml_find_all(response, ".//d1:subject") %>% xml_text() # empty
r_subject <- xml2::xml_find_all(response, ".//subject") %>% xml_text() # empty

# START NAVIGATION
r_root <- xml2::xml_root(response) # Returns root node
r_root
r_cdr <- xml2::xml_children(response) # Access the children nodes
r_cdr
str(r_cdr)

# all records as a list of lists
r_cdr_l <- xml2::as_list(r_cdr)
str(r_cdr_l)

# only the info in record: ListRecords is the 3rd child (after responseDate, request)
r_cdr_l_records <- r_cdr_l[3]
# only records as a list (one element per record)
records <- r_cdr_l_records[[1]]
# NOTE(review): "subject[1]" is used as a literal element name, not an index —
# confirm this key actually exists in the converted list.
records[[1]][["metadata"]][["dc"]][["subject[1]"]]

# name the records; seq_along() is safe even for a zero-length list,
# unlike 1:length(records)
names(records) <- paste0("WRD_", seq_along(records))
records_names <- c(names(records))
class(records_names)

# Parsing xml ONE INDICATOR AT A TIME (header)
… (form header) identifier
# # # identifier of record 1
# # records[["WRD_1"]][["header"]][["identifier"]]
# # # identifier of record 2
# # records[["WRD_2"]][["header"]][["identifier"]]
#
# # all identifier
#
# # 1) Prepare empty vectors
# id_l <- vector(mode = "character", length = length(records))
# id_v <- vector(mode = "character", length = length(records))
# # 2)
# for(i in seq_along(records)) {
# # extract ALL identifier LIST LIST
# id_l[[i]] <- records[[i]][["header"]][["identifier"]]
# # transform list in vector
# id_v[[i]] <- unlist(id_l[[i]])
# }
#
# # 3)
# id_v_t <- tibble::as_tibble_col(id_v, column_name ="WDR_hd_id")
# df_header_identifier <- id_v_t … (form header) setSpec
# # 0) set variables
# meta_item <- "setSpec"
# l <- records
#
# # 1) Prepare empty output
# item_l <- list(mode = "character", length = length(l))
# item_v <- vector(mode = "character", length = length(l))
#
# # 2) ET loop
# for(i in seq_along(l)) {
# # extract ALL setSpec LIST LIST
# item_l[[i]] <- l[[i]][["header"]][[meta_item]]
# # transform list in vector
# item_v[[i]] <- unlist(item_l[[i]])
# }
# # 3) set column name
# column_name <- paste0("WDR_hd_", meta_item)
#
# # 4) make it a tibble
# item_v_t <- as_tibble_col(item_v, column_name =column_name)
#
# # 5) rename result df
# # assign("new.name",old.name)
# assign(paste0("df_header_", meta_item),item_v_t)
#
# df_header_setSpec <- item_v_t # non so se serve a qlc Parsing xml ONE INDICATOR AT A TIME (metadata)
- here I am going to list one level up
records[["WRD_1"]][["metadata"]][["dc"]][["item"]] - Problem: multiple repetition of some "items"
… (form metadata) title
# # EG all title(s)
# # records[["WRD_1"]][["metadata"]][["dc"]][["title"]]
#
# # 1) Prepare empty ouppt
# item_l <- list(mode = "character", length = length(records))
# item_v <- vector(mode = "character", length = length(records))
# # 2)
# for(i in seq_along(records)) {
# # extract ALL identifier LIST LIST
# item_l[[i]] <- records[[i]][["metadata"]][["dc"]][["title"]]
# # transform list in vector
# item_v[[i]] <- unlist(item_l[[i]])
# }
# column_name <- paste0("WRD_mt_", "title")
#
# # 3)
# item_v_t <- as_tibble_col(item_v, column_name =column_name)
# df_meta_title <- item_v_t … (form metadata) creator
# # EG all creator(s)
# # records[["WRD_1"]][["metadata"]][["dc"]][["creator"]]
#
# # 0) choose metat_item
# meta_item <- "creator"
#
# # 1) Prepare empty ouppt
# item_l <- list(mode = "character", length = length(records))
# item_v <- vector(mode = "character", length = length(records))
# # 2)
# for(i in seq_along(records)) {
# # extract ALL identifier LIST LIST
# item_l[[i]] <- records[[i]][["metadata"]][["dc"]][[meta_item]]
# # transform list in vector
# item_v[[i]] <- unlist(item_l[[i]])
# }
# # 3) set column name
# column_name <- paste0("WRD_mt_", meta_item)
#
# # 3)
# item_v_t <- as_tibble_col(item_v, column_name =column_name)
# df_meta_creator<- item_v_t … (form metadata) identifier
# # 0) set variables
# meta_item <- "identifier"
# l <- records
#
# # 1) Prepare empty output
# item_l <- list(mode = "character", length = length(l))
# item_l
# item_v <- vector(mode = "character", length = length(l))
# item_v
# # 2) ET loop
# for(i in seq_along(l)) {
# # extract ALL identifier LIST LIST
# item_l[[i]] <- l[[i]][["metadata"]][["dc"]][[meta_item]]
# # transform list in vector
# item_v[[i]] <- unlist(item_l[[i]])
# }
# # 3) set column name
# column_name <- paste0("WRD_mt_", meta_item)
#
# # 4) make it a tibble
# item_v_t <- as_tibble_col(item_v, column_name =column_name)
#
# # 5) rename result df
# # assign("new.name",old.name)
# # assign(paste0("df_meta_", meta_item),item_v_t)
# df_meta_identifier <- item_v_t… (form metadata) description
# # 0) set variables
# meta_item <- "description"
# l <- records
#
# # 1) Prepare empty output
# item_l <- list(mode = "character", length = length(l))
# item_l
# item_v <- vector(mode = "character", length = length(l))
# item_v
# # 2) ET loop
# for(i in seq_along(l)) {
# # extract ALL description LIST LIST
# item_l[[i]] <- l[[i]][["metadata"]][["dc"]][[meta_item]]
# # transform list in vector
# item_v[[i]] <- unlist(item_l[[i]])
# }
# # 3) set column name
# column_name <- paste0("WRD_mt_", meta_item)
#
# # 4) make it a tibble
# item_v_t <- as_tibble_col(item_v, column_name =column_name)
#
# # 5) rename result df
# # assign("new.name",old.name)
# assign(paste0("df_meta_", meta_item),item_v_t)… (form metadata) date
# # 0) set variables
# meta_item <- "date"
# l <- records
#
# # 1) Prepare empty output
# item_l <- list(mode = "character", length = length(l))
# item_l
# item_v <- vector(mode = "character", length = length(l))
# item_v
# # 2) ET loop
# for(i in seq_along(l)) {
# # extract ALL date LIST LIST
# item_l[[i]] <- l[[i]][["metadata"]][["dc"]][[meta_item]]
# # transform list in vector
# item_v[[i]] <- unlist(item_l[[i]])
# }
# # 3) set column name
# column_name <- paste0("WRD_mt_", meta_item)
#
# # 4) make it a tibble
# item_v_t <- as_tibble_col(item_v, column_name =column_name)
#
# # 5) rename result df
# # assign("new.name",old.name)
# assign(paste0("df_meta_", meta_item),item_v_t)… (form metadata) subject
# # 0) set variables
# meta_item <- "subject"
# l <- records
#
# # 1) Prepare empty output
# item_l <- list(mode = "character", length = length(l))
# item_l
# item_v <- vector(mode = "character", length = length(l))
# item_v
# # 2) ET loop
# for(i in seq_along(l)) {
# # extract ALL subject LIST LIST
# item_l[[i]] <- l[[i]][["metadata"]][["dc"]][[meta_item]]
# # transform list in vector
# item_v[[i]] <- unlist(item_l[[i]])
# }
# # 3) set column name
# column_name <- paste0("WRD_mt_", meta_item)
#
# # 4) make it a tibble
# item_v_t <- as_tibble_col(item_v, column_name =column_name)
#
# # 5) rename result df
# # assign("new.name",old.name)
# assign(paste0("df_meta_", meta_item),item_v_t)same but as FUNCTION -> YEP!!!!!!!
#### … ALL from header (func)
# ---------- FUNCTION to create a column of Titles --> DOES NOT Work!
# NOTE(review): turn_list_to_tibble() must be defined before this block runs;
# the source() call below is commented out — uncomment it or define the function.
# source(here::here("R", "turn_list_to_tibble.R"))

# ------------- USE FUNCTION (called INSIDE THE `assign`)
# Extract one header item per stanza and assign the resulting one-column
# tibble to a variable named col_<item> (built on the fly via assign()).
## 1) input: header --> identifier
records_in <- records # renamed from `list`, which shadowed base::list
item_name <- "identifier"
where <- "header"
df_name <- paste("col", item_name, sep = "_")
## function call (that changes the name of the output on the fly)
assign(df_name, value = turn_list_to_tibble(records_in, where, item_name))

## 2) input: header --> setSpec
item_name <- "setSpec"
df_name <- paste("col", item_name, sep = "_")
assign(df_name, value = turn_list_to_tibble(records_in, where, item_name))

#### … ALL from meta (func)
## --------- ALL six metadata items in one loop ---------
# The original repeated the same four statements for identifier, title, date,
# creator, subject, and description. One loop produces the identical
# col_m_<item> variables (e.g. col_m_title) via assign(), exactly as before.
meta_items <- c("identifier", "title", "date", "creator",
                "subject", "description")
for (item_name in meta_items) {
  ## function call (that changes the name of the output on the fly)
  df <- turn_list_to_tibble(records, "meta", item_name)
  # rename output: col_m_<item>
  df_name <- paste("col_m", item_name, sep = "_")
  assign(df_name, value = df)
}

# … bind ALL - ALL cols into 1 tibble
# Collect the names of every col_* tibble built above (for inspection).
list_col <- ls(pattern = "^col_", all.names = TRUE)
list_col
cat(noquote(list_col))

# Bind all metadata columns side by side into one tibble; .name_repair
# de-duplicates clashing column names, clean_names() normalizes them.
df_metadata <- bind_cols(col_identifier,
                         col_m_identifier,
                         col_m_title,
                         col_m_date,
                         col_m_creator,
                         col_m_subject,
                         col_m_description,
                         col_setSpec, .name_repair = "unique") %>%
  janitor::clean_names()

# save in data/derived_data
Subject / keywords problem
Following SO Rsponse
—- DONT RUN —–
# Reference solution from Stack Overflow: build one tibble from repeated XML
# nodes by pairing key/value children. DON'T RUN here — `doc` is not defined
# in this script; the snippet is kept only as a pattern to imitate below.
xml_find_all(doc, ".//ArchivedIncident") %>% # iterate over each incident
map_df(~{
set_names(
xml_find_all(.x, ".//value/value") %>% xml_text(), # get entry values
xml_find_all(.x, ".//key") %>% xml_text() # get entry keys (column names)
) %>%
as.list() %>% # turn named vector to list
flatten_df() %>% # and list to df
mutate(ID = xml_attr(.x, "ID")) # add id
}) %>%
type_convert() %>% # let R convert the values for you
select(ID, everything()) # get it in the order you likely want
## Expected output quoted from the SO answer:
## # A tibble: 2 x 5
## ID TEST1 TEST2 TEST3 TEST4
## <int> <chr> <int> <chr> <chr>
## 1 100 <NA> 12 A <NA>
## 2 101 BLAH NA <NA> <NA>
# ---- my try: adapt the SO pattern to the OAI response.
# Take the 3rd child of the response (ListRecords) and, for each <record>,
# pair subject values with their metadata container to build one tibble.
child <- xml_child(response, 3)
xml_find_all(child, ".//record") %>% # iterate over each record
map_df(~{
set_names(
xml_find_all(.x, ".//subject/subject") %>% xml_text(), # get entry values
xml_find_all(.x, ".//metadata") %>% xml_text() # get entry keys (column names)
) %>%
as.list() %>% # turn named vector to list
flatten_df() %>% # and list to df
mutate(ID = xml_attr(.x, "ID")) # add id
}) %>%
type_convert() %>% # let R convert the values for you
select(ID, everything()) # get it in the order you likely want
# Here I see keywords
2022: https://openknowledge.worldbank.org/handle/10986/36883?show=full
oai:openknowledge.worldbank.org:10986/2586
# read WDR metadata previously saved to disk
WDR <- readr::read_rds(here::here("data", "raw_data", "WDR.rds"))

WDR <- WDR %>%
  # Extract only the portion of the string AFTER the last slash "/"
  mutate(id = stringr::str_extract(doc_mt_identifier_1, "[^/]+$")) %>%
  dplyr::relocate(id, .before = doc_mt_identifier_1) %>%
  # Build the "?show=full" landing-page URL, which exposes the keywords.
  mutate(url_keys = paste0("https://openknowledge.worldbank.org/handle/10986/", id, "?show=full")) %>%
  dplyr::relocate(url_keys, .before = doc_mt_identifier_1)

# spot-check first and last records
WDR$id[1]
WDR$url_keys[1]
WDR$id[45] # NOTE(review): assumes 45 rows — confirm with nrow(WDR)

# print every landing-page URL (seq_len(nrow(.)) instead of hard-coded 1:45)
for (i in seq_len(nrow(WDR))) {
  print(WDR$url_keys[i])
}

# REVIEW ---
Go back and try to extract dates & keywords
REVIEW —
1. World Bank Projects & Operations:
—- DONT RUN —–
Wrangling the text
Following: (Kaye 2019; Robinson 2017)
Tokenization
A token is a meaningful unit of text, such as a word, that we are interested in using for analysis
bigrams
connections
Following the example of David Robinson on HN titles
Data sources:
- World Bank Projects & Operations: https://datacatalog.worldbank.org/search/dataset/0037800 https://datacatalog.worldbank.org/search/dataset/0037800/World-Bank-Projects—Operations
- Accessibility Classification: public under Creative Commons Attribution 4.0
- World Bank - World Development Reports
- Accessibility Classification:
Acknowledgements
- Computing for Social Science Course
-
Stephanie Tran project who created the function
R/f_scrape_WB-OKR.R - Renu Khandelwal tutorial
—————————————————————————-
Reference Tutorials
(Robinson and Silge 2022) (LDAL 2022) (edureka?!2019)
Benjamin Soltoff: Computing 4 Social Sciences - API
Benjamin Soltoff: Computing 4 Social Sciences - text analysis
Ben Schmidt Book Humanities Course; Ben Schmidt Book Humanities
- ✔️ MEDIUM articles: common words, pairwise correlations - 2018-12-04
- TidyTuesday Tweets - 2019-01-07
- Wine Ratings - 2019-05-31 Lasso regression | sentiment lexicon,
- Simpsons Guest Stars 2019-08-30 geom_histogram
- Horror Movies 2019-10-22 explaining glmnet package | Lasso regression
- The Office 2020-03-16 geom_text_repel from ggrepel | glmnet package to run a cross-validated LASSO regression
- Animal Crossing 2020-05-05 Using geom_line and geom_point to graph ratings over time | geom_text to visualize what words are associated with positive/negative reviews |topic modelling